library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(viridis)
## Loading required package: viridisLite
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(hms)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:hms':
## 
##     hms
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(httr)
## 
## Attaching package: 'httr'
## The following object is masked from 'package:plotly':
## 
##     config
library(rvest)
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
## 
##     guess_encoding
# Default knitr chunk options for the whole document: show the code, hide
# warnings, and size figures consistently.
knitr::opts_chunk$set(
  echo      = TRUE,
  warning   = FALSE,
  fig.width = 6,
  fig.asp   = 0.6,
  out.width = "90%"
)

# Every ggplot uses the minimal theme with the legend placed below the panel.
theme_set(
  theme_minimal() +
    theme(legend.position = "bottom")
)

# Make viridis the default palette for continuous colour and fill scales.
# Both options must be spelled "viridis"; ggplot2 does not recognise any other
# spelling as this palette.
options(
  ggplot2.continuous.colour = "viridis",
  ggplot2.continuous.fill = "viridis"
)

# Shadow ggplot2's default discrete fill and colour scales with their viridis
# counterparts, so discrete aesthetics pick up the viridis palette too.
scale_fill_discrete = scale_fill_viridis_d
scale_colour_discrete = scale_colour_viridis_d
# Race years whose files are read with the first (shorter) column
# specification below.
years_1 <- c(1900:2012, 2014)
# Race years whose files are read with the second (longer) column
# specification below.
years_2 <- c(2015:2019)

# Read one results CSV, choosing the column specification from the year that
# appears in the file path.
#
# x: path to a single results file (length-one character vector).
#
# Returns whatever read_csv() produces for the file, with "NULL", "" and "0"
# read as NA. If the path contains no year from years_1 or years_2, a warning
# is raised and NULL is returned, so an unrecognised file is reported instead
# of being skipped silently.
importing_data = function(x) {
  na_strings <- c("NULL", "", "0")

  if (grepl(paste(years_1, collapse = "|"), x)) {
    read_csv(x, na = na_strings, col_types = "cicccciiiicc")
  } else if (grepl(paste(years_2, collapse = "|"), x)) {
    read_csv(x, na = na_strings, col_types = "cccicccccccccccccccccciiiiccc")
  } else {
    warning("No known race year found in file name: ", x, call. = FALSE)
    NULL
  }
}

# Combined results table: read every file under data/, stack the per-file
# tables, then tidy a few columns.
boston_df <-
  tibble(file_name = list.files("data", full.names = TRUE)) %>%
  mutate(data = map(file_name, importing_data)) %>%
  unnest(data) %>%
  mutate(
    # the year is parsed from the number in the file path
    year = readr::parse_number(file_name),
    # use city where it is present, otherwise fall back to residence
    city = coalesce(city, residence),
    # replace every character that is not an ASCII letter or digit by a space
    display_name = str_replace_all(display_name, "[^a-zA-Z0-9]", " ")
  ) %>%
  filter(!is.na(display_name)) %>%
  select(-c(file_name, residence))

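Do you need to load hms separately, or is it part of the tidyverse? (hms is installed along with the tidyverse, but `library(tidyverse)` does not attach it, so `library(hms)` is still needed to call `as_hms()` unqualified.)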

# Analysis copy of boston_df with typed columns: times as hms values, year as
# a factor, and overall place as a number.
winners_df =
  mutate(
    boston_df,
    across(c(official_time, pace), as_hms),
    place_overall = as.numeric(place_overall),
    year = as.factor(year)
  )

Make winners over time plot

# Point-and-line chart of official time by year for the rows with overall == 1.
winners_df %>%
  filter(overall == 1) %>%
  arrange(year) %>%
  # group = 1 joins all points into a single path even though year is discrete
  ggplot(mapping = aes(x = year, y = official_time, group = 1)) +
  geom_point() +
  geom_path() +
  # label only every tenth year to keep the axis readable
  scale_x_discrete(breaks = seq(1910, 2020, by = 10))

# Print the overall == 1 rows ordered from the shortest official time upwards.
arrange(
  filter(winners_df, overall == 1),
  official_time
)
## # A tibble: 125 × 30
##    display_name               age gender pace   official_time overall gender_result
##    <chr>                    <int> <chr>  <time> <time>          <int>         <int>
##  1 Jim Knaub                   NA M      03'08" 01:22:17            1             1
##  2 Franz Nietlispach           NA M      03'16" 01:25:59            1             1
##  3 Geoffrey Mutai              29 M      04'41" 02:03:02            1             1
##  4 Martin E Duffy              NA M      04'45" 02:04:54            1             1
##  5 Robert Kiprono Cheruiyot    21 M      04'48" 02:05:52            1             1
##  6 Robert Cheruiyot            27 M      04'51" 02:07:14            1             1
##  7 Cosmas Ndeti                NA M      04'51" 02:07:15            1             1
##  8 Moses Tanui                 NA M      04'51" 02:07:34            1             1
##  9 Steve Scannapieco           29 M      04'52" 02:07:46            1             1
## 10 Valerie Park                NA M      04'52" 02:07:46            1             1
## # … with 115 more rows, and 23 more variables: division_result <int>,
## #   seconds <int>, first_name <chr>, last_name <chr>, place_overall <dbl>,
## #   bib <chr>, name <chr>, city <chr>, state <chr>, country_residence <chr>,
## #   contry_citizenship <chr>, name_suffix <chr>, 5k <chr>, 10k <chr>,
## #   15k <chr>, 20k <chr>, half <chr>, 25k <chr>, 30k <chr>, 35k <chr>,
## #   40k <chr>, projected_time <chr>, year <fct>

Jim Knaub and Franz Nietlispach are reported as having official times of 1:22:17 and 1:25:59, but 1:59 is the fastest marathon ever run (though it was not officially recorded).

→ There are errors in the data (~1:20:00 is the fastest time here, but 1:59 is the fastest marathon ever run – though not officially recorded – https://www.nytimes.com/2019/10/12/sports/eliud-kipchoge-marathon-record.html)

Plotly

Fixing time variable – plotly can’t use lubridate

# Copy of boston_df for plotly: the two time columns are parsed as POSIXct
# using an hours:minutes:seconds format, year becomes a factor, and overall
# place becomes numeric.
plotly_win_df =
  mutate(
    boston_df,
    across(
      c(official_time, pace),
      ~ as.POSIXct(.x, format = "%H:%M:%OS")
    ),
    year = as.factor(year),
    place_overall = as.numeric(place_overall)
  )

making new plotly with reformatted time

# Interactive line chart of official time by year for the overall == 1 rows.
# Times are formatted back to H:M:S text for both the y axis and the hover box.
boston_winners =
  plotly_win_df %>%
  filter(overall == 1) %>%
  plot_ly(
    x = ~year,
    y = ~format(official_time, "%H:%M:%OS"),
    type = "scatter",
    mode = "lines",
    name = "Boston Winners",
    # show only the custom text below when hovering
    hoverinfo = "text",
    text = ~paste0(
      "Name: ", display_name, "\n",
      "Year: ", year, "\n",
      "Time: ", format(official_time, "%H:%M:%OS"), "\n",
      "Pace: ", format(pace, "%H:%M:%OS")
    )
  ) %>%
  layout(
    title = "Boston Marathon Winners by Year",
    xaxis = list(title = "Year"),
    yaxis = list(title = "Official Time")
  )

Marathon records – adding new dataset

# Download the marathon record progression page.
records_html =
  read_html("https://www.topendsports.com/sport/athletics/record-marathon.htm")


# Parse every <table> on the page and give each one the same five column
# names. The result is a list with one table per <table> node.
# NOTE(review): assumes every table on the page has exactly five columns --
# confirm against the live page.
record_marathon =
  records_html %>%
  html_nodes("table") %>%
  # TRUE rather than T: T is an ordinary variable that can be reassigned
  html_table(fill = TRUE) %>%
  lapply(setNames, c("time", "date", "athlete", "country", "marathon"))

# Flatten the scraped records into one data frame, parse the record time as
# hms, and keep only the year part of the date.
marathon =
  as.data.frame(record_marathon) %>%
  mutate(time = as_hms(time)) %>%
  # split the date into three pieces; only the year piece is kept
  separate(date, into = c("month", "day", "year")) %>%
  mutate(year = as.numeric(year)) %>%
  select(-c(month, day))

# Line chart of record time by year, with the athlete name as hover text.
# The y axis category order is taken from winners_df$official_time.
marathon %>%
  plot_ly(
    type = "scatter",
    mode = "lines",
    text = ~paste("Name: ", athlete)
  ) %>%
  add_trace(x = ~year, y = ~time) %>%
  layout(
    yaxis = list(
      categoryorder = "array",
      categoryarray = winners_df$official_time
    )
  )
library(purrr)
library(lubridate)

Ideas for further plots: age × year (using intervals?); pace × year; Boston winner compared to the overall record holder.